# Hashes for which skip_sample() returns True.
# NOTE(review): the single entry looks like a 40-character hex digest
# (SHA-1-like) — confirm what is being hashed and why it is banned.
BANNED_HASH = {
    "d01c1e79721b1ca8a9801ace27dc99226a5e7526"
}


def skip_sample(hash):
    """Return True when ``hash`` is a member of ``BANNED_HASH``, else False."""
    # The parameter shadows the builtin ``hash``; the name is kept because it
    # is part of the signature callers may use by keyword.
    is_banned = hash in BANNED_HASH
    return is_banned


def clean_nonbpe_tokens(nonbpe_tokens, lang):
    """Run the cleaner associated with ``lang`` over ``nonbpe_tokens``.

    Args:
        nonbpe_tokens: Tokens to clean; passed as-is to the selected cleaner.
        lang: Language key. "c", "go", "javascript" and "ruby" have a
            dedicated cleaner.

    Returns:
        The result of the language-specific cleaner, or ``nonbpe_tokens``
        itself (unchanged, same object) when ``lang`` has no cleaner.
    """
    cleaners = {
        "c": clean_C_nonbpe_tokens,
        "go": clean_Go_nonbpe_tokens,
        "javascript": clean_javascript_nonbpe_tokens,
        "ruby": clean_ruby_nonbpe_tokens,
    }

    cleaner = cleaners.get(lang)
    if cleaner is None:
        # No cleaner for this language: hand the input back untouched.
        return nonbpe_tokens
    return cleaner(nonbpe_tokens)
    

def clean_C_nonbpe_tokens(nonbpe_tokens):
    """Return a new list without tokens containing "\\r", "\\n" or "\\t".

    Args:
        nonbpe_tokens: Iterable of tokens to filter.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    control_chars = ("\r", "\n", "\t")
    return [
        token
        for token in nonbpe_tokens
        if not any(ch in token for ch in control_chars)
    ]


def clean_Go_nonbpe_tokens(nonbpe_tokens):
    """Return a new list without tokens containing "\\r", "\\n" or "\\t".

    Args:
        nonbpe_tokens: Iterable of tokens to filter.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    rejected_chars = ("\r", "\n", "\t")

    def is_clean(token):
        # A token survives only if none of the rejected characters occur in it.
        return all(ch not in token for ch in rejected_chars)

    return list(filter(is_clean, nonbpe_tokens))


def clean_javascript_nonbpe_tokens(nonbpe_tokens):
    """Return a new list of tokens with the rejected ones removed.

    A token is removed when it contains "\\r", "\\n" or "\\t", or when it
    starts with "#" or with a backslash.

    Args:
        nonbpe_tokens: Iterable of tokens to filter.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    control_chars = ("\r", "\n", "\t")
    rejected_prefixes = ("#", "\\")

    def is_clean(token):
        # The control-character check runs first, then the prefix check.
        if any(ch in token for ch in control_chars):
            return False
        return not token.startswith(rejected_prefixes)

    return [token for token in nonbpe_tokens if is_clean(token)]


def clean_ruby_nonbpe_tokens(nonbpe_tokens):
    """Return a new list of tokens with the rejected ones removed.

    A token is removed when it contains "\\r", "\\n" or "\\t", or when it
    starts with "?".

    Args:
        nonbpe_tokens: Iterable of tokens to filter.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    control_chars = ("\r", "\n", "\t")

    def is_rejected(token):
        # The control-character check runs first, then the prefix check.
        has_control_char = any(ch in token for ch in control_chars)
        return has_control_char or token.startswith("?")

    return [token for token in nonbpe_tokens if not is_rejected(token)]